# Preview the first six rows to check how the columns were read in.
head(taxi_data, n = 6L)
## # A tibble: 6 × 21
##    ...1 trip_distance rate_code store_and_fwd_flag payment_type fare_amount
##   <dbl>         <dbl>     <dbl> <chr>                     <dbl>       <dbl>
## 1     3         17.0          1 N                             1        49.5
## 2     4         14.4          1 N                             1        45.5
## 3     5         11.6          1 N                             1        42  
## 4    10          5.1          1 N                             1        26.5
## 5    12         11.1          1 N                             1        45.5
## 6    13          9.54         1 N                             1        41  
## # ℹ 15 more variables: extra <dbl>, mta_tax <dbl>, tip_amount <dbl>,
## #   tolls_amount <dbl>, imp_surcharge <dbl>, total_amount <dbl>,
## #   pickup_location_id <dbl>, dropoff_location_id <dbl>, year <dbl>,
## #   month <dbl>, day <dbl>, day_of_week <dbl>, hour_of_day <dbl>,
## #   trip_duration <dbl>, calculated_total_amount <dbl>
# Column-wise summary of the whole table, stored for reuse and then printed.
summary_stats <- summary(taxi_data)
print(summary_stats)
##       ...1         trip_distance        rate_code      store_and_fwd_flag
##  Min.   :      3   Min.   :   0.010   Min.   : 1.000   Length:1048575    
##  1st Qu.: 270199   1st Qu.:   6.470   1st Qu.: 1.000   Class :character  
##  Median : 542653   Median :   8.700   Median : 1.000   Mode  :character  
##  Mean   : 543661   Mean   :   9.093   Mean   : 1.111                     
##  3rd Qu.: 817071   3rd Qu.:  10.920   3rd Qu.: 1.000                     
##  Max.   :1090739   Max.   :7655.760   Max.   :99.000                     
##   payment_type    fare_amount           extra            mta_tax      
##  Min.   :1.000   Min.   :    0.01   Min.   :-0.4500   Min.   :0.0000  
##  1st Qu.:1.000   1st Qu.:   24.00   1st Qu.: 0.0000   1st Qu.:0.5000  
##  Median :1.000   Median :   29.00   Median : 0.0000   Median :0.5000  
##  Mean   :1.105   Mean   :   31.91   Mean   : 0.3208   Mean   :0.4864  
##  3rd Qu.:1.000   3rd Qu.:   36.00   3rd Qu.: 0.5000   3rd Qu.:0.5000  
##  Max.   :4.000   Max.   : 9999.50   Max.   :18.5000   Max.   :0.5000  
##    tip_amount       tolls_amount     imp_surcharge  total_amount     
##  Min.   :  0.000   Min.   :  0.000   Min.   :0.0   Min.   :    0.31  
##  1st Qu.:  4.560   1st Qu.:  0.000   1st Qu.:0.3   1st Qu.:   30.35  
##  Median :  6.150   Median :  0.000   Median :0.3   Median :   38.47  
##  Mean   :  6.287   Mean   :  2.268   Mean   :0.3   Mean   :   41.58  
##  3rd Qu.:  8.110   3rd Qu.:  5.760   3rd Qu.:0.3   3rd Qu.:   48.36  
##  Max.   :415.000   Max.   :910.900   Max.   :0.6   Max.   :10001.30  
##  pickup_location_id dropoff_location_id      year          month       
##  Min.   :  1        Min.   :  1.0       Min.   :2018   Min.   : 1.000  
##  1st Qu.:132        1st Qu.: 87.0       1st Qu.:2018   1st Qu.: 3.000  
##  Median :138        Median :141.0       Median :2018   Median : 6.000  
##  Mean   :153        Mean   :146.9       Mean   :2018   Mean   : 6.294  
##  3rd Qu.:186        3rd Qu.:229.0       3rd Qu.:2018   3rd Qu.: 9.000  
##  Max.   :265        Max.   :265.0       Max.   :2018   Max.   :12.000  
##       day         day_of_week     hour_of_day    trip_duration   
##  Min.   : 1.00   Min.   :0.000   Min.   : 0.00   Min.   :     1  
##  1st Qu.: 9.00   1st Qu.:1.000   1st Qu.:10.00   1st Qu.:  1449  
##  Median :16.00   Median :3.000   Median :14.00   Median :  1853  
##  Mean   :15.78   Mean   :2.936   Mean   :13.87   Mean   :  2212  
##  3rd Qu.:23.00   3rd Qu.:4.000   3rd Qu.:19.00   3rd Qu.:  2329  
##  Max.   :31.00   Max.   :6.000   Max.   :23.00   Max.   :320031  
##  calculated_total_amount
##  Min.   :    0.31       
##  1st Qu.:   30.35       
##  Median :   38.47       
##  Mean   :   41.50       
##  3rd Qu.:   48.30       
##  Max.   :10001.30
# NOTE(review): `count` is not used by this plot; it is kept in case code
# outside this view reads it.
count <- c(250000, 500000, 1000000, 1048575)

# Histogram of trip distances, restricted to trips of 0-20 miles.
# - `boundary = 0` aligns the 1-mile bins to whole miles. With the default
#   alignment the first and last bins extend past the x limits and are
#   dropped from the plot.
# - `labs()` is joined to the chain with `+`; previously it was a separate
#   statement, so the title and axis labels were never applied.
ggplot(taxi_data, aes(x = trip_distance)) +
  geom_histogram(
    binwidth = 1,
    boundary = 0,
    fill = "skyblue",
    color = "black"
  ) +
  xlim(0, 20) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Trip Distance Distribution",
    x = "Distance (miles)",
    y = "Count"
  )
## Warning: Removed 22541 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Mean trip distance for each pickup location ID.
pickup_data <- taxi_data %>%
  group_by(pickup_location_id) %>%
  summarize(avg_distance = mean(trip_distance))

# `color` is a fixed setting, so it is passed to the geoms rather than placed
# inside `aes()`. Mapping the string "red" inside `aes()` treats it as data:
# the default palette color is drawn and a one-entry legend is added.
ggplot(pickup_data, aes(x = pickup_location_id, y = avg_distance)) +
  geom_line(color = "red") +
  geom_point(color = "red") +
  labs(
    title = "Average Distance by Pickup ID",
    x = "Pickup ID",
    y = "Average Distance"
  ) +
  theme_minimal()

# Mean trip distance for each dropoff location ID.
dropoff_data <- taxi_data %>%
  group_by(dropoff_location_id) %>%
  summarize(avg_distance = mean(trip_distance))

# `color` is a fixed setting, so it is passed to the geoms rather than placed
# inside `aes()`. Mapping the string "red" inside `aes()` treats it as data:
# the default palette color is drawn and a one-entry legend is added.
ggplot(dropoff_data, aes(x = dropoff_location_id, y = avg_distance)) +
  geom_line(color = "red") +
  geom_point(color = "red") +
  labs(
    title = "Average Distance by Dropoff ID",
    x = "Dropoff ID",
    y = "Average Distance"
  ) +
  theme_minimal()

# Number of trips starting at each pickup location ID.
# - `geom_bar()` has no `binwidth` parameter (it was ignored with a warning),
#   so it is removed.
# - `xlim(1, 265)` removed two bars from the plot ("Removed 2 rows"),
#   presumably the ones at IDs 1 and 265, whose edges extend past the limits.
#   `coord_cartesian()` sets the visible range without discarding anything.
# - `labs()` is joined to the chain with `+`; previously it was a separate
#   statement, so the title and axis labels were never applied.
ggplot(taxi_data, aes(x = pickup_location_id)) +
  geom_bar(fill = "yellow", color = "black") +
  coord_cartesian(xlim = c(1, 265)) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Pickup Location Distribution",
    x = "Pickup ID",
    y = "Count"
  )
# Number of trips ending at each dropoff location ID.
# - `geom_bar()` has no `binwidth` parameter (it was ignored with a warning),
#   so it is removed.
# - `xlim(1, 265)` removed two bars from the plot ("Removed 2 rows"),
#   presumably the ones at IDs 1 and 265, whose edges extend past the limits.
#   `coord_cartesian()` sets the visible range without discarding anything.
# - `labs()` is joined to the chain with `+`; previously it was a separate
#   statement, so the title and axis labels were never applied.
ggplot(taxi_data, aes(x = dropoff_location_id)) +
  geom_bar(fill = "yellow", color = "black") +
  coord_cartesian(xlim = c(1, 265)) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Dropoff Location Distribution",
    x = "Dropoff ID",
    y = "Count"
  )
# Add a `region` column derived from the pickup location ID.
# IDs that appear in none of the lists below get `NA`, because `case_when()`
# has no fallback branch here.
taxi_data <- mutate(
  taxi_data,
  region = case_when(
    pickup_location_id %in% c(
      3, 18, 20, 31, 32, 46, 47, 51, 58, 59, 69, 78, 81, 94, 119, 126, 136,
      147, 159, 167, 168, 169, 174, 182, 183, 184, 185, 199, 200, 208, 212,
      213, 220, 235, 240, 241, 242, 247, 248, 250, 254, 259
    ) ~ "Bronx",
    pickup_location_id %in% c(
      2, 7, 8, 9, 10, 15, 16, 19, 27, 28, 30, 38, 53, 56, 64, 70, 73, 82, 83,
      86, 92, 93, 95, 96, 98, 101, 102, 117, 121, 122, 124, 129, 130, 131,
      132, 134, 135, 138, 139, 145, 146, 157, 160, 171, 173, 175, 179, 180,
      191, 192, 193, 196, 197, 198, 201, 203, 205, 207, 215, 216, 218, 219,
      223, 226, 252, 253, 258, 260
    ) ~ "Queens",
    pickup_location_id %in% c(
      4, 12, 13, 24, 41, 42, 43, 45, 48, 50, 68, 74, 75, 79, 87, 88, 90, 100,
      103, 107, 113, 114, 116, 120, 125, 127, 128, 137, 140, 141, 142, 143,
      144, 148, 151, 152, 153, 158, 161, 162, 163, 164, 166, 170, 186, 194,
      202, 209, 211, 224, 229, 230, 231, 232, 233, 234, 236, 237, 238, 239,
      243, 244, 246, 249, 261, 262, 263
    ) ~ "Manhattan",
    pickup_location_id %in% c(
      11, 14, 17, 21, 22, 25, 26, 29, 33, 34, 35, 36, 37, 39, 40, 49, 52, 54,
      55, 61, 62, 63, 65, 66, 67, 71, 72, 76, 77, 80, 85, 89, 91, 97, 106,
      108, 111, 112, 123, 133, 149, 150, 154, 155, 165, 177, 178, 181, 188,
      189, 190, 195, 210, 217, 222, 225, 227, 228, 255, 256, 257
    ) ~ "Brooklyn",
    pickup_location_id %in% c(
      5, 6, 23, 44, 84, 99, 109, 110, 115, 118, 156, 172, 176, 187, 204, 206,
      214, 221, 245, 251
    ) ~ "Staten Island"
  )
)
# Bar chart of trip counts per pickup region.
# `scales::comma` is namespace-qualified, matching the other plots in this
# file, so the chart does not depend on the scales package being attached.
ggplot(taxi_data, aes(x = region)) +
  geom_bar(fill = "steelblue") +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Number of Trips by Region",
    x = "Region",
    y = "Trip Count"
  ) +
  theme_minimal()

# Recode the numeric weekday (0-6) as a labelled factor.
# NOTE(review): this assumes the data encodes 0 as Sunday. Some tools number
# weekdays from Monday = 0, so confirm against how the column was produced.
taxi_data <- mutate(
  taxi_data,
  day_of_week = factor(
    day_of_week,
    levels = 0:6,
    labels = c(
      "Sunday", "Monday", "Tuesday", "Wednesday",
      "Thursday", "Friday", "Saturday"
    )
  )
)
# One row per region: mean trip distance, mean total amount, the most
# frequent weekday and pickup hour, and the number of trips.
region_summary <- summarise(
  group_by(taxi_data, region),
  avg_trip_distance = mean(trip_distance, na.rm = TRUE),
  avg_total_amount = mean(total_amount, na.rm = TRUE),
  # Tabulate the values, sort by frequency, and keep the name of the top one.
  most_common_day = names(
    sort(table(day_of_week), decreasing = TRUE)
  )[1],
  most_common_hour = as.integer(
    names(sort(table(hour_of_day), decreasing = TRUE))[1]
  ),
  trip_count = n()
)

print(region_summary)
## # A tibble: 6 × 6
##   region     avg_trip_distance avg_total_amount most_common_day most_common_hour
##   <chr>                  <dbl>            <dbl> <chr>                      <int>
## 1 Bronx                  11.1              45.4 Monday                         7
## 2 Brooklyn                8.94             37.5 Thursday                       8
## 3 Manhattan               7.83             37.5 Wednesday                     22
## 4 Queens                 11.5              49.1 Sunday                        21
## 5 Staten Is…             17.3              81.4 Thursday                      13
## 6 <NA>                    8.86             47.2 Wednesday                     15
## # ℹ 1 more variable: trip_count <int>
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Basic plotly bar chart of trip counts per pickup region.
region_counts <- taxi_data %>%
  count(region)

# Rows with `region = NA` (pickup IDs that matched no region) were dropped by
# plotly with "Ignoring 1 observations", so this chart disagreed with the
# ggplot version of the same data. They are labelled explicitly for plotting;
# `region_counts` itself is left unchanged.
plot_ly(
  data = mutate(region_counts, region = coalesce(region, "Unknown")),
  x = ~region,
  y = ~n,
  type = "bar",
  text = ~paste("Trips:", n),
  hoverinfo = "text",
  marker = list(color = "steelblue")
) %>%
  layout(
    title = "Number of Trips by Region",
    xaxis = list(title = "Region"),
    yaxis = list(title = "Trip Count", tickformat = ",")  # comma format numbers
  )
# Trip counts for every weekday / hour-of-day combination.
pickup_grid <- count(taxi_data, day_of_week, hour_of_day)

# Heatmap with hour on the x axis, weekday on the y axis, and tile fill
# showing the number of trips in that cell.
ggplot(
  data = pickup_grid,
  mapping = aes(x = hour_of_day, y = day_of_week, fill = n)
) +
  geom_tile(color = "white") +
  scale_fill_viridis_c(option = "C") +
  labs(
    title = "NYC Taxi Pickups by Hour and Day",
    x = "Hour of Day",
    y = "Day of Week",
    fill = "Pickup Count"
  ) +
  theme_minimal()